# Load the imputed datasetdf_rq5_imputed<-readRDS(file.path("data", "df_rq5_imputed.Rds"))# Cleaned and put-together imputed data# Sample sizedf_rq5_imputed%>%filter(.imp==1)%>%nrow()
[1] 12976
Code
# Create comparison datasets - only rq5y variablesoriginal_dataset<-df%>%filter(!(randomfamid%in%exclude_fams_onesib))%>%filter(!(randomfamid%in%rq5_exclude_fams))%>%filter(!(randomfamid%in%rq5_exclude_fams_2))%>%# Exclude fams with less than 30% data on all imputed data (excluding baseline data)select(all_of(rq5y))# imputed_dataset <- df_rq5_imputed %>%# select(all_of(rq5y))
Descriptive stats
Missingness plots
Code
df%>%select(amohqualn,all_of(rq5y))%>%`colnames<-`(c("Y1: Mother Education", rq5y_labels_short))%>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame()%>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations( confidence_interval =FALSE, sample_size =FALSE)
Warning in gbtoolbox::plot_correlations(., confidence_interval = FALSE, : This function is in development, and not yet ready for widespread use.
Proceed with caution
Warning in gbtoolbox::plot_missing_correlations(., n_decimal_places = 2, : This function is in early beta, and not yet ready for widespread use.
Proceed with caution
Code
save_plot("5_3_descriptive_plot_missing_correlations_rq5y", width =5.3, height =5.3)# takes a long time to run with all the vairables in if(FALSE){df%>%select(any_of(c(rq5y, rq5z)))%>%select(where(is.numeric))%>%select(rq5y, everything())%>%as.data.frame()%>%gbtoolbox::plot_missing_correlations(cluster_variables =FALSE, textadjust =0)}
df%>%select(any_of(rq5z))%>%select(!ends_with("2"))%>%# because the data is in a long format, we don't need the twin 2 variables! apply(.,2,function(x)length(which(!is.na(x)))/length(x))%>%as.data.frame()%>%rownames_to_column()%>%`colnames<-`(c("var","percent_notmissing"))%>%mutate(# var = factor(var, levels = rq5z), var_label =sapply(var, function(x)ifelse(!is.null(var_to_label(x)[[1]]), var_to_label(x), x)), var_label =paste0(var_label, " (", var, ")"), var_label =factor(var_label, levels =var_label))%>%#pull(var_label) %>% duplicated() %>% table()arrange(percent_notmissing)%>%ggplot(aes(x =percent_notmissing, y =var_label))+geom_col()+geom_vline(aes(xintercept=.2))+theme_bw()+labs(x="Percent of not-missing data", y =NULL)
Code
# save_plot("5_3_descriptive_missing_data_frequency_auxillaryvars", width =12, height =32)df%>%select(any_of(rq5y))%>%# filter(rowSums(!is.na(.)) > 0) %>% apply(.,2,function(x)length(which(!is.na(x)))/length(x))%>%as.data.frame()%>%rownames_to_column()%>%`colnames<-`(c("var","percent_notmissing"))%>%mutate(var_label =factor(var, levels =rev(rq5y), labels =rev(rq5y_labels_short)))%>%ggplot(aes(x =percent_notmissing, y =var_label))+geom_col()+geom_vline(aes(xintercept=.2))+geom_text(aes(label =paste0(round(percent_notmissing*100),"%")), hjust=1.2)+theme_bw()+labs(x="Percent of not-missing data", y =NULL)
Code
save_plot("5_3_descriptive_missing_data_frequency", width =12, height =32)df%>%select(cohort, any_of(rq5y))%>%pivot_longer(cols =-cohort, names_to ="var", values_to ="value")%>%mutate(var =factor(var, levels =rev(rq5y), labels =rev(rq5y_labels_short)))%>%group_by(cohort, var)%>%summarise( total_n =dplyr::n(), not_missing_n =sum(!is.na(value)), percent_notmissing =not_missing_n/total_n, .groups ="drop")%>%mutate(# var_label = factor(var, levels = unique(var), labels = var_to_label(unique(var))), cohort =factor(cohort, levels =c("Cohort 4: twins born Sep-96 to Dec-96","Cohort 3: twins born Sep-95 to Aug-96","Cohort 2: twins born Sep-94 to Aug-95","Cohort 1: twins born Jan-94 to Aug-94"), labels =c("4", "3", "2", "1")))%>%ggplot(aes(x =percent_notmissing, y =var, fill =cohort))+geom_col(position =position_dodge(width =0.8), width =0.7)+geom_vline(aes(xintercept =0.2), linetype ="dashed")+geom_text(aes(label =paste0(round(percent_notmissing*100), "%")), position =position_dodge(width =0.8), hjust =-.2, size =3)+theme_bw()+labs( x ="Percent of not-missing data BY COHORT", y =NULL, fill ="Cohort")+theme( legend.position ="bottom", axis.text.y =element_text(size =8))+guides(fill =guide_legend(reverse =TRUE))
Age in years of natural mother at time of birth of twins
numeric
98
4.81
31
Year 1 (1st Contact)
asingle
asingle
Single Parent
cohabiting biological mother and father / cohabiting biological parent with other, single parent
factor
98
0.26
2
Year 1 (1st Contact)
amohqualn
amohqualn
Maternal Education (formatted as numeric)
numeric
98
1.99
8
Year 1 (1st Contact)
amosoc2
amosoc2
Mother SOC employment level (1st Contact), 1-9
caring for children at home, 1, 2, 3, 4, 5, 6, 7, 8, 9, no job
factor
99
3.66
11
Year 1 (1st Contact)
alang
alang
Main language spoken at home (1st Contact), see value labels
other, English, English + other
factor
99
0.20
3
Year 1 (1st Contact)
cohort
cohort
School cohort, see value labels
Cohort 1: twins born Jan-94 to Aug-94, Cohort 2: twins born Sep-94 to Aug-95, Cohort 3: twins born Sep-95 to Aug-96, Cohort 4: twins born Sep-96 to Dec-96
factor
100
0.95
4
Year 3
aethnic
aethnic
Ethnic origin of twins, simplified coding (1st Contact), 1=white 0=other
1, 0
numeric
100
0.27
2
Year 1 (1st Contact)
anoldsibn
anoldsibn
Number of older siblings (formatted as numeric variable)
1, 0, 2, 3, 4, 5
numeric
100
0.96
6
Year 1 (1st Contact)
anyngsibn
anyngsibn
Number of younger siblings (formatted as numeric variable)
0, 1, 2
numeric
100
0.21
3
Year 1 (1st Contact)
asmoke
asmoke
Smoked cigarettes while pregnant (1st Contact), 1Y 0N
0, 1
factor
100
0.39
2
Year 1 (1st Contact)
Missing data proportions by group
Interestingly, MZ twins have more missing data than DZ twins
Code
# Check proportion of missing data by sexzyg groupdf%>%select(sexzyg, any_of(rq5y))%>%group_by(sexzyg)%>%summarise( n =dplyr::n(), total_cells =dplyr::n()*length(rq5y), missing_cells =sum(is.na(c_across(any_of(rq5y)))), overall_prop_missing =missing_cells/total_cells, .groups ="drop")%>%knitr::kable(digits =2)
Number of missing cells per participant for rq5y variables
For the variables G composite scale from child web tests at 12, standardised, G composite scale from child web tests at 14, standardised, G composite scale from child web tests at 16, standardised, G-game overall total score, 0-40, End of KS3 all-subject Academic achievement mean level (from parent SLQ), 1-9, Core subjects (English, maths, science): mean grade in GCSE results (twin exams at 16), 4-11, Twin probable highest level of qualification including current study (TEDS21 phase 1 twin qnr), 1-11 see value labels, Demographics item: highest qualification ordinal level (TEDS26 twin MHQ), see value labels, MFQ scale from 11 MFQ items (child self-report) at 12, 0-22, MFQ total scale (child behaviour qnr at 16), 0-26, MFQ overall total score (TEDS21 phase 1 twin qnr), 0-16, MFQ overall total score (TEDS26 twin MHQ), 0-26, General Anxiety overall total score (TEDS21 phase 2 twin qnr), 0-40, GAD-D (General Anxiety) overall total score (TEDS26 twin MHQ), 0-40, SDQ Externalising scale at 12, SDQ Externalising scale at 16, SDQ Externalising scale at 21, SDQ Externalising scale at 26, how many cells is each participant missing.
Code
df%>%select(any_of(rq5y))%>%mutate(missing_count =rowSums(is.na(.)))%>%select(missing_count)%>%count(missing_count)%>%mutate( percent =round(n/sum(n)*100, 1), total_vars =length(rq5y))%>%knitr::kable( col.names =c("# Missing Variables Per Pps", "N pps", "% pps", "Total Variables"), caption ="How many missing cells does each participant have on the key outcome variables? ")
How many missing cells does each participant have on the key outcome variables?
Note. Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (pBonferroni-Holm) effects are highlighted in green (increases) or red (decreases).
THIS PLOT NEEDS TO BE UPDATED AS WE DIDN”TINCLUDE ALL PARTICIPANTS IN df IN THE ANALYSES
Code
df%>%filter(!(randomfamid%in%exclude_fams_onesib))%>%# Exclude fams with one sub in the studyfilter(!(randomfamid%in%rq5_exclude_fams))%>%# Exclude fams with 0 data on rq5y variables (key outcomes)filter(!(randomfamid%in%rq5_exclude_fams_2))%>%select(all_of(rq5y))%>%data.frame()%>%`colnames<-`(rq5y_labels_short)%>%gbtoolbox::plot_correlations( confidence_interval =FALSE, text_size_r =2.5, abs_colour =FALSE)+labs( title ="Unimputed Pairwise correlations", subtitle ="Below Diagonal: Correlations. Diagonal: Univariate Sample Sizes. Above Diagonal: Pairwise Sample Sizes", caption =NULL, tag ="B1")+theme( plot.title =element_text(hjust =0.5, size =16, face ="plain"), plot.subtitle =element_text(hjust =0, size =8, margin =margin(b =0)), plot.caption =element_text(hjust =1, size =8), plot.tag =element_text(hjust =0, vjust =0, size =30, face ="bold"), plot.tag.position ="topleft", plot.background =element_blank())+scale_fill_gradient2( low ="#0571b0", mid ="white", high ="#ca0020", midpoint =0, limits =c(-.9, .9), # Set min and max here na.value ="white")
Warning in gbtoolbox::plot_correlations(., confidence_interval = FALSE, : This function is in development, and not yet ready for widespread use.
Proceed with caution
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Code
save_plot("5_correlations_unimputed", width =8, height =8, trim =TRUE)
1 P values are Bonferroni-Holm adjusted within each group (Original, Imputed, Difference)
Plot
Code
bootstrap_summary_df_ace%>%filter(par%in%c("a","c","e"))%>%filter(group!="Difference")%>%mutate( name =factor(rq5y_labels_short[match(name, rq5y)], levels =rq5y_labels_short), group =ifelse(group=="Imputed", "I", "O"), group =factor(group, levels =c("I", "O")), sex =str_to_title(sex), par =toupper(par))%>%ggplot(aes(x =group, y =y, fill =par))+geom_col(position ="stack", alpha =1)+facet_grid(sex~name, switch ="x")+theme_minimal()+theme( axis.text.x =element_text(angle =0, hjust =.5, size =10), strip.text =element_text(size =10), strip.text.x =element_text(angle =90, hjust =1, vjust =.5, size =10), strip.text.y =element_text(angle =0, size =11), strip.placement ="outside", panel.grid =element_blank(), plot.tag =element_text(hjust =0, vjust =0, size =24, face ="bold"), plot.tag.position ="topleft")+labs( title ="Imputed (I) or Original (Non-Imputed; O) ACE Estimates", y ="Proportion", x =NULL, fill ="Component", tag ="B")+scale_fill_manual(values =c("A"="#d73027", "C"="#fee08b", "E"="#4575b4"))+coord_cartesian(ylim =c(0, 1))
Code
save_plot("5_ace_comparison", width =8, height =5, trim =TRUE)
Supplementary Tables & Plots
Comparison of sample size pre- and post-imputation
Code
p1=df%>%select(all_of(rq5y))%>%`colnames<-`(c(rq5y_labels_short))%>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame()%>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations( confidence_interval =FALSE, sample_size =TRUE, text_size_r =1.5, text_size_axis =8.5)+labs(title ="Original (non-imputed data)")
Warning in gbtoolbox::plot_correlations(., confidence_interval = FALSE, : This function is in development, and not yet ready for widespread use.
Proceed with caution
Code
p2=df_rq5_imputed%>%filter(.imp==1)%>%select(all_of(rq5y))%>%`colnames<-`(c(rq5y_labels_short))%>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame()%>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations( confidence_interval =FALSE, sample_size =TRUE, text_size_r =1.5, text_size_axis =8.5)+labs(title ="Imputed data")
Warning in gbtoolbox::plot_correlations(., confidence_interval = FALSE, : This function is in development, and not yet ready for widespread use.
Proceed with caution
# Function to map variable prefix to study waveget_study_wave=function(var_name){prefix=substr(var_name, 1, 1)wave_map=c("a"="1st Contact","b"="2 Year", "c"="3 Year","d"="4 Year","e"="In Home","g"="7 Year","h"="8 Year", "i"="9 Year","j"="10 Year","l"="12 Year","n"="14 Year","p"="16 Year","r"="18 Year","u"="21 Year","z"="26 Year")return(wave_map[prefix])}impute_vars=readRDS(file.path("results","5_1_imputation_variables.Rds"))impute_vars_labels=var_to_label(impute_vars)%>%sapply(., function(x)ifelse(is.null(x[1]), "", x[1]))# Create formatted table for imputation variablesv_impute=data.frame( Description =impute_vars_labels, `Teds Code` =ifelse(impute_vars%in%original_colnames, impute_vars, paste0(impute_vars,"*")), `Range or Level` =sapply(impute_vars, function(var){if(var%in%colnames(df)){if(class(df[[var]])=="numeric"){paste0(round(min(df[[var]], na.rm =TRUE), 2), " — ", round(max(df[[var]], na.rm =TRUE), 2))}elseif(is.factor(df[[var]])){factor_levels=levels(df[[var]])paste(c(paste0(factor_levels[1],"*"), factor_levels[-1]), collapse =", ")}else{paste(unique(df[[var]]), collapse =", ")}}else{"Variable not found"}}), N =sapply(impute_vars, function(var){if(var%in%colnames(df)){sum(!is.na(df[[var]]))}else{0}}), `Study.Wave` =sapply(impute_vars, get_study_wave))# Clean up descriptions# v_impute$Description = str_remove(v_impute$Description, "\\(1st.*")# v_impute$Description = str_remove(v_impute$Description, "\\(2.*")# Manual edit for specific variablesv_impute$`Study.Wave`[v_impute$`Teds.Code`=="cens01pop98density"]="1st Contact"v_impute$`Study.Wave`[v_impute$`Teds.Code`=="pollution1998pca"]="1st Contact"# Add row group information# v_impute_indexed = cbind(row_group = "Imputation Variables", row_id = 1:nrow(v_impute), v_impute)gt(v_impute)%>%# tab_row_group(# label = "Imputation Variables",# rows = row_group == "Imputation Variables"# ) %>%# cols_hide(c(row_group, row_id)) %>%cols_label_with(fn =~gsub("\\.", " ", .x))%>%tab_style( style =cell_text(size =px(12)), locations =cells_body())%>%tab_style( style =cell_text(size =px(12)), locations =cells_column_labels())%>%tab_style( style =cell_text(size =px(12)), locations =cells_row_groups())%>%cols_hide(N)%>%tab_source_note( source_note ="Note: Externalising SDQ scores are created post-imputation by adding conduct and hyperactivity problem scales. Range or Level shows min—max values for numeric variables or factor levels for categorical variables (reference level marked with *). Variable codes with an asterisk (*) have been derived or modified from the original dataset.")%>%tab_options( table.width ="70%")%>%cols_width(Description~px(300),everything()~px(120))
Description
Teds Code
Range or Level
Study Wave
School cohort, see value labels
cohort
Cohort 1: twins born Jan-94 to Aug-94*, Cohort 2: twins born Sep-94 to Aug-95, Cohort 3: twins born Sep-95 to Aug-96, Cohort 4: twins born Sep-96 to Dec-96
3 Year
Age in years of natural mother at time of birth of twins
amumagetw
14 — 45
1st Contact
Single Parent
asingle*
cohabiting biological mother and father / cohabiting biological parent with other*, single parent
1st Contact
Mother SOC employment level (1st Contact), 1-9
amosoc2*
caring for children at home*, 1, 2, 3, 4, 5, 6, 7, 8, 9, no job
1st Contact
Maternal Education (formatted as numeric)
amohqualn*
1 — 8
1st Contact
Ethnic origin of twins, simplified coding (1st Contact), 1=white 0=other
aethnic
0 — 1
1st Contact
Main language spoken at home (1st Contact), see value labels
alang
other*, English, English + other
1st Contact
Number of older siblings (formatted as numeric variable)
anoldsibn*
0 — 5
1st Contact
Number of younger siblings (formatted as numeric variable)
anyngsibn*
0 — 2
1st Contact
Member of a Twins Club (1st Contact), 1Y 0N
atwclub
0*, 1
1st Contact
Census data 2001 (code KS001) linked to 1998 postcode: population density, N per hectare
cens01pop98density
0 — 310
1st Contact
Smoked cigarettes while pregnant (1st Contact), 1Y 0N
Child Verbal/Language composite (7 year twin phone), standardised
gcl1
-3.04 — 5.9
7 Year
Child Verbal/Language composite (7 year twin phone), standardised
gcl2
-3.04 — 5.9
7 Year
Child Non-verbal composite (7 year twin phone), standardised
gcn1
-4.32 — 3.15
7 Year
Child Non-verbal composite (7 year twin phone), standardised
gcn2
-4.32 — 3.15
7 Year
2-subject (maths, English) mean NC level (7 year teacher), 0-4
gt2ac1
0 — 4
7 Year
2-subject (maths, English) mean NC level (7 year teacher), 0-4
gt2ac2
0 — 4
7 Year
SDQ hyperactivity total (7 year parent), 0-10
gpsdqhypt1
0 — 10
7 Year
SDQ hyperactivity total (7 year parent), 0-10
gpsdqhypt2
0 — 10
7 Year
SDQ conduct total (7 year parent), 0-10
gpsdqcont1
0 — 10
7 Year
SDQ conduct total (7 year parent), 0-10
gpsdqcont2
0 — 10
7 Year
ARBQ overall anxiety total (7 year parent), 0-52
gpanxt1
0 — 46
7 Year
ARBQ overall anxiety total (7 year parent), 0-52
gpanxt2
0 — 46
7 Year
Conners ADHD overall Total at 8 (0-54)
hconnt1
0 — 54
8 Year
Conners ADHD overall Total at 8 (0-54)
hconnt2
0 — 54
8 Year
Verbal composite (9 year child), standardised
icvb1
-4.58 — 2.61
9 Year
Verbal composite (9 year child), standardised
icvb2
-4.58 — 2.61
9 Year
Non-Verbal composite (9 year child), standardised
icnv1
-3.68 — 1.31
9 Year
Non-Verbal composite (9 year child), standardised
icnv2
-3.68 — 1.31
9 Year
3-subject (English, maths, science) mean NC level (9 year teacher), 1-5
it3ac1
1 — 5
9 Year
SDQ Conduct scale (child self-report) at 9, 0-10
icsdqcont1
0 — 10
9 Year
SDQ Conduct scale (child self-report) at 9, 0-10
icsdqcont2
0 — 10
9 Year
SDQ Hyperactivity scale (child self-report) at 9, 0-10
icsdqhypt1
0 — 10
9 Year
SDQ Hyperactivity scale (child self-report) at 9, 0-10
icsdqhypt2
0 — 10
9 Year
SDQ Hyperactivity scale (parent) at 9, 0-10
ipsdqhypt1
0 — 10
9 Year
SDQ Hyperactivity scale (parent) at 9, 0-10
ipsdqhypt2
0 — 10
9 Year
SDQ Conduct scale (parent) at 9, 0-10
ipsdqcont1
0 — 10
9 Year
SDQ Conduct scale (parent) at 9, 0-10
ipsdqcont2
0 — 10
9 Year
Negative parental feelings scale, child-self-rated at 9, 0-8
icparnegt1
0 — 8
9 Year
Negative parental feelings scale, child-self-rated at 9, 0-8
icparnegt2
0 — 8
9 Year
ARBQ overall anxiety total (9 year parent), 0-50
ipanxt1
0 — 42
9 Year
ARBQ overall anxiety total (9 year parent), 0-50
ipanxt2
0 — 42
9 Year
Parent Chaos scale at 9, 0-12
ipchatot
0 — 12
9 Year
Child Non-verbal composite (10 year twin web), standardised
jcnv1
-5.2 — 2.45
10 Year
Child Non-verbal composite (10 year twin web), standardised
jcnv2
-5.2 — 2.45
10 Year
Child Verbal composite (10 year twin web), standardised
jcvb1
-4.14 — 3.07
10 Year
Child Verbal composite (10 year twin web), standardised
jcvb2
-4.14 — 3.07
10 Year
3-subject (English, maths, science) mean NC level (10 year teacher), 1-5
jt3ac1
1 — 5
10 Year
3-subject (English, maths, science) overall mean NC level from parent-reported school report at 12, 2-7
lp3ac1
2 — 7
12 Year
3-subject (English, maths, science) overall mean NC level from parent-reported school report at 12, 2-7
lp3ac2
2 — 7
12 Year
SDQ Hyperactivity scale (child self-report) at 12, 0-10
lcsdqhypt1
0 — 10
12 Year
SDQ Hyperactivity scale (child self-report) at 12, 0-10
lcsdqhypt2
0 — 10
12 Year
SDQ Conduct scale (child self-report) at 12, 0-10
lcsdqcont1
0 — 10
12 Year
SDQ Conduct scale (child self-report) at 12, 0-10
lcsdqcont2
0 — 10
12 Year
SDQ Hyperactivity scale (parent) at 12, 0-10
lpsdqhypt1
0 — 10
12 Year
SDQ Hyperactivity scale (parent) at 12, 0-10
lpsdqhypt2
0 — 10
12 Year
SDQ Conduct scale (parent) at 12, 0-10
lpsdqcont1
0 — 10
12 Year
SDQ Conduct scale (parent) at 12, 0-10
lpsdqcont2
0 — 10
12 Year
MFQ scale from 11 MFQ items (child self-report) at 12, 0-22
lcmfqt1
0 — 22
12 Year
MFQ scale from 11 MFQ items (child self-report) at 12, 0-22
lcmfqt2
0 — 22
12 Year
MFQ scale from 11 MFQ items (parent) at 12, 0-22
lpmfqt1
0 — 22
12 Year
MFQ scale from 11 MFQ items (parent) at 12, 0-22
lpmfqt2
0 — 22
12 Year
Conners ADHD overall scale (parent) at 12, 0-54
lpconnt1
0 — 54
12 Year
Conners ADHD overall scale (parent) at 12, 0-54
lpconnt2
0 — 54
12 Year
Parental Feelings negative subscale (child self-reported) at 12, 0-8
lcparnegt1
0 — 8
12 Year
Parental Feelings negative subscale (child self-reported) at 12, 0-8
lcparnegt2
0 — 8
12 Year
Chaos scale (parent-reported, from 5 items) at 12, 0-10
lpchatot
0 — 10
12 Year
Chaos scale (child self-reported, from 6 items) at 12, 0-12
lcchato1
0 — 12
12 Year
Chaos scale (child self-reported, from 6 items) at 12, 0-12
lcchato2
0 — 12
12 Year
End of KS3 3-subject Academic achievement mean level (from parent SLQ), 1-9
npks3t3a1
1 — 9
14 Year
End of KS3 3-subject Academic achievement mean level (from parent SLQ), 1-9
npks3t3a2
1 — 9
14 Year
Conners Inattention scale at 14 (child), 0-27
ncconint1
0 — 27
14 Year
Conners Inattention scale at 14 (child), 0-27
ncconint2
0 — 27
14 Year
Conners Hyperactivity-Impulsivity scale at 14 (child), 0-27
ncconhit1
0 — 27
14 Year
Conners Hyperactivity-Impulsivity scale at 14 (child), 0-27
ncconhit2
0 — 27
14 Year
Conners total scale at 14 (parent), 0-54
npconnt1
0 — 54
14 Year
Conners total scale at 14 (parent), 0-54
npconnt2
0 — 54
14 Year
Negative Parental Feelings scale at 14 (child), 0-8
ncparnegt1
0 — 8
14 Year
Negative Parental Feelings scale at 14 (child), 0-8
ncparnegt2
0 — 8
14 Year
Chaos at home total scale at 14 (child), 0-12
ncchato1
0 — 11
14 Year
Chaos at home total scale at 14 (child), 0-12
ncchato2
0 — 11
14 Year
SDQ Conduct scale (child behaviour qnr at 16), 0-10
pcbhsdqcont1
0 — 10
16 Year
SDQ Conduct scale (child behaviour qnr at 16), 0-10
pcbhsdqcont2
0 — 10
16 Year
SDQ Hyperactivity scale (child behaviour qnr at 16), 0-10
pcbhsdqhypt1
0 — 10
16 Year
SDQ Hyperactivity scale (child behaviour qnr at 16), 0-10
pcbhsdqhypt2
0 — 10
16 Year
SDQ Conduct scale (parent behaviour qnr at 16), 0-10
ppbhsdqcont1
0 — 10
16 Year
SDQ Conduct scale (parent behaviour qnr at 16), 0-10
ppbhsdqcont2
0 — 10
16 Year
SDQ Hyperactivity scale (parent behaviour qnr at 16), 0-10
ppbhsdqhypt1
0 — 10
16 Year
SDQ Hyperactivity scale (parent behaviour qnr at 16), 0-10
ppbhsdqhypt2
0 — 10
16 Year
ARBQ Anxiety overall total scale (parent behaviour qnr at 16), 0-38
ppbhanxt1
0 — 33.78
16 Year
ARBQ Anxiety overall total scale (parent behaviour qnr at 16), 0-38
ppbhanxt2
0 — 33.78
16 Year
Chaos total score (child web at 16), 0-12
pcchatot1
0 — 12
16 Year
Chaos total score (child web at 16), 0-12
pcchatot2
0 — 12
16 Year
SDQ Conduct total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqcont1
0 — 9
21 Year
SDQ Conduct total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqcont2
0 — 9
21 Year
SDQ Hyperactivity total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqhypt1
0 — 10
21 Year
SDQ Hyperactivity total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqhypt2
0 — 10
21 Year
SDQ Conduct total score (TEDS26 twin MHQ), 0-10
zmhsdqcont1
0 — 9
26 Year
SDQ Conduct total score (TEDS26 twin MHQ), 0-10
zmhsdqcont2
0 — 9
26 Year
SDQ Hyperactivity total score (TEDS26 twin MHQ), 0-10
zmhsdqhypt1
0 — 10
26 Year
SDQ Hyperactivity total score (TEDS26 twin MHQ), 0-10
zmhsdqhypt2
0 — 10
26 Year
G composite scale from child web tests at 12, standardised
lcg1
-3.67 — 3.04
12 Year
G composite scale from child web tests at 12, standardised
lcg2
-3.67 — 3.04
12 Year
G composite scale from child web tests at 14, standardised
ncg1
-4.12 — 3.17
14 Year
G composite scale from child web tests at 14, standardised
ncg2
-4.12 — 3.17
14 Year
G composite scale from child web tests at 16, standardised
pcg1
-2.86 — 4.06
16 Year
G composite scale from child web tests at 16, standardised
pcg2
-2.86 — 4.06
16 Year
G-game overall total score, 0-40
ucgt1
3 — 40
21 Year
G-game overall total score, 0-40
ucgt2
3 — 40
21 Year
End of KS3 all-subject Academic achievement mean level (from parent SLQ), 1-9
npks3tall1
1 — 9
14 Year
End of KS3 all-subject Academic achievement mean level (from parent SLQ), 1-9
npks3tall2
1 — 9
14 Year
Core subjects (English, maths, science): mean grade in GCSE results (twin exams at 16), 4-11
pcexgcsecoregrdm1
4 — 11
16 Year
Core subjects (English, maths, science): mean grade in GCSE results (twin exams at 16), 4-11
pcexgcsecoregrdm2
4 — 11
16 Year
Twin probable highest level of qualification including current study (TEDS21 phase 1 twin qnr), 1-11 see value labels
u1chqualp1
1 — 11
21 Year
Twin probable highest level of qualification including current study (TEDS21 phase 1 twin qnr), 1-11 see value labels
u1chqualp2
1 — 11
21 Year
Demographics item: highest qualification ordinal level (TEDS26 twin MHQ), see value labels
zmhhqual1
1 — 11
26 Year
Demographics item: highest qualification ordinal level (TEDS26 twin MHQ), see value labels
zmhhqual2
1 — 11
26 Year
MFQ total scale (child behaviour qnr at 16), 0-26
pcbhmfqt1
0 — 26
16 Year
MFQ total scale (child behaviour qnr at 16), 0-26
pcbhmfqt2
0 — 26
16 Year
MFQ overall total score (TEDS21 phase 1 twin qnr), 0-16
u1cmfqt1
0 — 16
21 Year
MFQ overall total score (TEDS21 phase 1 twin qnr), 0-16
u1cmfqt2
0 — 16
21 Year
MFQ overall total score (TEDS26 twin MHQ), 0-26
zmhmfqt1
0 — 26
26 Year
MFQ overall total score (TEDS26 twin MHQ), 0-26
zmhmfqt2
0 — 26
26 Year
General Anxiety overall total score (TEDS21 phase 2 twin qnr), 0-40
u2cganxt1
0 — 40
21 Year
General Anxiety overall total score (TEDS21 phase 2 twin qnr), 0-40
u2cganxt2
0 — 40
21 Year
GAD-D (General Anxiety) overall total score (TEDS26 twin MHQ), 0-40
zmhganxt1
0 — 40
26 Year
GAD-D (General Anxiety) overall total score (TEDS26 twin MHQ), 0-40
zmhganxt2
0 — 40
26 Year
Note: Externalising SDQ scores are created post-imputation by adding conduct and hyperactivity problem scales. Range or Level shows min—max values for numeric variables or factor levels for categorical variables (reference level marked with *). Variable codes with an asterisk (*) have been derived or modified from the original dataset.
Source Code
---title: "RQ5: complete case analysis vs imputation analysis"format: html: embed-resources: true fig-width: 8 fig-height: 8 code-link: true code-fold: true code-tools: true df-print: paged toc: true grid: body-width: 900pxeditor: sourceeditor_options: chunk_output_type: console---# Load datadocker container used: bignardig/tidyverse451:v7```{r}#| output: falsesource("0_load_data.R")bootstrap_summary_df =readRDS(file =file.path("results", "5_bootstrap_summary_df.Rds"))bootstrap_summary_df_ace =readRDS(file =file.path("results", "5_bootstrap_summary_df_ace.Rds"))boot_compare_results =readRDS(file =file.path("results", "5_boot_compare_results.Rds"))# ace_comparisons = readRDS(file = file.path("results", "5_ace_comparisons.Rds"))imputed_mice =readRDS(file.path("results","5_1_all_imputations.Rds")) # Less-cleaned MICE output (useful for getting diagnostics...)# original_variances = sapply(rq5y, function(v) sd(df[[v]], na.rm = TRUE)^2)bootstrap_summary_df = bootstrap_summary_df %>%group_by(parameter) %>%mutate(pval_adj = stats::p.adjust(pval, method ="holm") )```## Load imputed dataset (cominbed)```{r}# Load the imputed datasetdf_rq5_imputed <-readRDS(file.path("data", "df_rq5_imputed.Rds")) # Cleaned and put-together imputed data# Sample sizedf_rq5_imputed %>%filter(.imp ==1) %>%nrow()# Create comparison datasets - only rq5y variablesoriginal_dataset <- df %>%filter(!(randomfamid %in% exclude_fams_onesib)) %>%filter(!(randomfamid %in% rq5_exclude_fams)) %>%filter(!(randomfamid %in% rq5_exclude_fams_2)) %>%# Exclude fams with less than 30% data on all imputed data (excluding baseline data)select(all_of(rq5y)) # imputed_dataset <- df_rq5_imputed %>%# select(all_of(rq5y))```# Descriptive stats## Missingness plots```{r}df %>%select(amohqualn,all_of(rq5y)) %>%`colnames<-`(c("Y1: Mother Education", rq5y_labels_short)) %>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame() %>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations(confidence_interval =FALSE,sample_size =FALSE )save_plot("5_3_descriptive_plot_correlations_rq5y", width =5.3, height =5.3)df %>%select(amohqualn,all_of(rq5y)) %>%`colnames<-`(c("Y1: Mother Education", rq5y_labels_short)) %>%as.data.frame() %>% gbtoolbox::plot_missing_correlations(n_decimal_places =2,cluster_variables =FALSE )save_plot("5_3_descriptive_plot_missing_correlations_rq5y", width =5.3, height =5.3)# takes a long time to run with all the vairables in if(FALSE){df %>%select(any_of(c(rq5y, rq5z))) %>%select(where(is.numeric)) %>%select(rq5y, everything()) %>%as.data.frame() %>% gbtoolbox::plot_missing_correlations(cluster_variables =FALSE, textadjust =0) }``````{r}#| include: false# Variable check function.variable_check =function(x, name =NULL){if (!is.null(name)) { var_name <- name } else { var_name <-deparse(substitute(x)) }cat("Variable:", var_name, "\n")cat("Class:", class(x), "\n")cat("Label:", attr(x,"label"), "\n")print(table(x, useNA ="always"))cat("\n")return(invisible(NULL))}# # Check variable characteristics (subset for output)# df %>%# select(any_of(rq5z)) %>%# mutate_if(is.numeric, ~ round(.x, 1)) %>%# # select(150:250) %>%# purrr::imap(~.variable_check(.x, .y))# Check number of unique values per variabledf %>%select(any_of(rq5z)) %>%mutate_if(is.numeric, ~round(.x, 1)) %>%apply(.,2, function(x) length(unique(x))) %>%table()```## Missing data frequency plot[Click here for full-size plot](plots/5_3_missing_data_frequency.pdf){target="_blank"}```{r}df %>%select(any_of(rq5z)) %>%select(!ends_with("2")) %>%# because the data is in a long format, we don't need the twin 2 variables! apply(.,2,function(x) length(which(!is.na(x)))/length(x)) %>%as.data.frame() %>%rownames_to_column() %>%`colnames<-`(c("var","percent_notmissing")) %>%mutate(# var = factor(var, levels = rq5z),var_label =sapply(var, function(x) ifelse(!is.null(var_to_label(x)[[1]]), var_to_label(x), x)), var_label =paste0(var_label, " (", var, ")"),var_label =factor(var_label, levels = var_label) ) %>%#pull(var_label) %>% duplicated() %>% table()arrange(percent_notmissing) %>%ggplot(aes(x = percent_notmissing, y = var_label)) +geom_col() +geom_vline(aes(xintercept=.2)) +theme_bw() +labs(x="Percent of not-missing data", y =NULL)# save_plot("5_3_descriptive_missing_data_frequency_auxillaryvars", width =12, height =32)df %>%select(any_of(rq5y)) %>%# filter(rowSums(!is.na(.)) > 0) %>% apply(.,2,function(x) length(which(!is.na(x)))/length(x)) %>%as.data.frame() %>%rownames_to_column() %>%`colnames<-`(c("var","percent_notmissing")) %>%mutate(var_label =factor(var, levels =rev(rq5y), labels =rev(rq5y_labels_short))) %>%ggplot(aes(x = percent_notmissing, y = var_label)) +geom_col() +geom_vline(aes(xintercept=.2)) +geom_text(aes(label =paste0(round(percent_notmissing*100),"%")), hjust=1.2) +theme_bw() +labs(x="Percent of not-missing data", y =NULL)save_plot("5_3_descriptive_missing_data_frequency", width =12, height =32)df %>%select(cohort, any_of(rq5y)) %>%pivot_longer(cols =-cohort, names_to ="var", values_to ="value") %>%mutate(var =factor(var, levels =rev(rq5y), labels =rev(rq5y_labels_short))) %>%group_by(cohort, var) %>%summarise(total_n = dplyr::n(),not_missing_n =sum(!is.na(value)),percent_notmissing = not_missing_n / total_n,.groups ="drop" ) %>%mutate(# var_label = factor(var, levels = unique(var), labels = var_to_label(unique(var))),cohort =factor(cohort,levels =c("Cohort 4: twins born Sep-96 to Dec-96","Cohort 3: twins born Sep-95 to Aug-96","Cohort 2: twins born Sep-94 to Aug-95","Cohort 1: twins born Jan-94 to Aug-94"),labels =c("4", "3", "2", "1") ) ) %>%ggplot(aes(x = percent_notmissing, y = var, fill = cohort)) +geom_col(position =position_dodge(width =0.8), width =0.7) +geom_vline(aes(xintercept =0.2), linetype ="dashed") +geom_text(aes(label =paste0(round(percent_notmissing*100), "%")),position =position_dodge(width =0.8),hjust =-.2,size =3 ) +theme_bw() +labs(x ="Percent of not-missing data BY COHORT",y =NULL,fill ="Cohort" ) +theme(legend.position ="bottom",axis.text.y =element_text(size =8) ) +guides(fill =guide_legend(reverse =TRUE))save_plot("5_3_descriptive_missing_data_frequency_bycohort", width =8, height =13)```## Descriptive table of imputation variables```{r} get_levels_summary <-function(x) {if (is.factor(x)) {paste(levels(x), collapse =", ") } else { unique_vals <-na.omit(unique(x))if (length(unique_vals) <=6) {paste(unique_vals, collapse =", ") } else {# paste(length(unique_vals), "unique values")paste("") } } }# List of all variables for imputationdf_rq5 = df %>%select(any_of(rq5z))# Descriptive information on each imputed variableimpute_df =data.frame(var =colnames(df_rq5),label =as.character(sapply(df_rq5, function(x) attr(x, "label"))),levels =map_chr(df_rq5, get_levels_summary),class =as.character(sapply(df_rq5, function(x) class(x))),perc_not_missing =as.numeric(sapply(df_rq5, function(x) round(length(which(!is.na(x)))/length(x)*100))),sd =round(as.numeric(sapply(df_rq5, function(x) sd(as.numeric(x), na.rm =TRUE) )),2),distinct_categories =as.numeric(sapply(df_rq5, function(x) length(na.omit(unique(x))))))impute_df <- impute_df %>%mutate(variable_year =case_when(str_starts(var, "a") ~"Year 1 (1st Contact)",str_starts(var, "b") ~"Year 2",str_starts(var, "c") ~"Year 3",str_starts(var, "d") ~"Year 4",str_starts(var, "g") ~"Year 7",str_starts(var, "h") ~"Year 8",str_starts(var, "i") ~"Year 9",str_starts(var, "j") ~"Year 10",str_starts(var, "l") ~"Year 12",str_starts(var, "n") ~"Year 14",str_starts(var, "p") ~"Year 16",str_starts(var, "r") ~"Year 18",str_starts(var, "u") ~"Year 21",str_starts(var, "z") ~"Year 26",TRUE~"Other" ))impute_df %>%arrange(perc_not_missing) %>% knitr::kable()```## Missing data proportions by groupInterestingly, MZ twins have more missing data than DZ twins ```{r}# Check proportion of missing data by sexzyg groupdf %>%select(sexzyg, any_of(rq5y)) %>%group_by(sexzyg) %>%summarise(n = dplyr::n(),total_cells = dplyr::n() *length(rq5y),missing_cells =sum(is.na(c_across(any_of(rq5y)))),overall_prop_missing = missing_cells / total_cells,.groups ="drop" ) %>% knitr::kable(digits =2)# Gives the same results! # df %>%# select(sexzyg, any_of(rq5y)) %>%# pivot_longer(cols = any_of(rq5y)) %>%# group_by(sexzyg) %>%# summarise(# total_cells = length(value),# missing_cells = sum(is.na(value)),# overall_prop_missing = missing_cells / total_cells,# .groups = "drop"# ) %>%# knitr::kable(digits =2)```## Number of missing cells per participant for rq5y variablesFor the variables `r var_to_label(rq5y)`, how many cells is each participant missing. ```{r}df %>%select(any_of(rq5y)) %>%mutate(missing_count =rowSums(is.na(.))) %>%select(missing_count) %>%count(missing_count) %>%mutate(percent =round(n /sum(n) *100, 1),total_vars =length(rq5y) ) %>% knitr::kable(col.names =c("# Missing Variables Per Pps", "N pps", "% pps", "Total Variables"),caption ="How many missing cells does each participant have on the key outcome variables? " )```## Missing data flux```{r}df %>%select(any_of(c(rq1x,rq5z))) %>% mice::fluxplot()```## Plot illustrating showing how imputation works```{r}set.seed(10)df_rq5_imputed %>%filter(randomtwinid2 %in%sample(.$randomtwinid2,30)) %>%mutate(id =factor(randomtwinid2, labels =paste0("pps",1:length(unique(.$randomtwinid2))))) %>%mutate(value = lcg1) %>%ggplot(aes(x = value)) +geom_histogram(bins =10) +facet_wrap(~id, ncol =5, scales ="fixed") +labs(y ="General Cognitive Ability scores at age 12 (standardised; lcg1)",# title = NULL,x =NULL ) +theme_bw()save_plot("5_3_descriptive_imputation_distribution.pdf", width =9, height =6)df$lcg1 %>%sd(na.rm =T)```# Changes in means and variances## GT table```{r}library(gt)bootstrap_summary_df %>%filter(parameter %in%c("md", "smd", "var")) %>%ungroup() %>%mutate(outcome = rq5y_labels_short[match(.$outcome, rq5y)]) %>%# group_by(parameter) %>%# mutate(pval_adj = stats::p.adjust(pval, method = "holm")) %>%select(-pd,-pval, -n) %>%select(-starts_with(".")) %>%pivot_wider(values_from =c("y","ymin","ymax","pval_adj"),names_from =c("parameter") ) %>%# Arrange columns in order: md, smd, varselect(outcome, y_md, pval_adj_md, ymin_md, ymax_md, y_smd, pval_adj_smd, ymin_smd, ymax_smd, y_var, pval_adj_var, ymin_var, ymax_var) %>%gt() %>%fmt(columns =!contains("parameter") &!contains("outcome"),fns =function(x) {gbtoolbox::apa_num(x, n_decimal_places =3)} ) %>%fmt(columns ="pval_adj_var",fns =function(x) {gbtoolbox::apa_num(x, n_decimal_places =3)} ) %>%fmt_percent(columns =c("y_var", "ymin_var", "ymax_var"),decimals =2,drop_trailing_zeros =FALSE,drop_trailing_dec_mark =FALSE ) %>%# Format numeric columns# fmt_number(columns = starts_with("y_"), decimals = 3) %>%# fmt_number(columns = starts_with("ymin_"), decimals = 3) %>%# fmt_number(columns = starts_with("ymax_"), decimals = 3) %>%# fmt_scientific(columns = starts_with("pval_adj_"), decimals = 3)# Color code based on significance - separate rules for each statistictab_style(style =list(cell_fill(color ="#ffcccc")),locations =cells_body(columns =c(y_md, pval_adj_md, ymin_md, ymax_md),rows = pval_adj_md <0.05& y_md <0 ) ) %>%tab_style(style =list(cell_fill(color ="#ccffcc")),locations =cells_body(columns =c(y_md, pval_adj_md, ymin_md, ymax_md),rows = pval_adj_md <0.05& y_md >0 ) ) %>%tab_style(style =list(cell_fill(color ="#ffcccc")),locations =cells_body(columns =c(y_smd, pval_adj_smd, ymin_smd, ymax_smd),rows = pval_adj_smd <0.05& y_smd <0 ) ) %>%tab_style(style =list(cell_fill(color ="#ccffcc")),locations =cells_body(columns =c(y_smd, pval_adj_smd, ymin_smd, ymax_smd),rows = pval_adj_smd <0.05& y_smd >0 ) ) %>%tab_style(style =list(cell_fill(color ="#ffcccc")),locations =cells_body(columns =c(y_var, pval_adj_var, ymin_var, ymax_var),rows = pval_adj_var <0.05& y_var <0 ) ) %>%tab_style(style =list(cell_fill(color ="#ccffcc")),locations =cells_body(columns =c(y_var, pval_adj_var, ymin_var, ymax_var),rows = pval_adj_var <0.05& y_var >0 ) ) %>%# Add column labelscols_label(outcome ="Variable",y_md ="Est", pval_adj_md ="p", ymin_md ="LB", ymax_md ="UB",y_smd ="Est", pval_adj_smd ="p", ymin_smd ="LB", ymax_smd ="UB", y_var ="Est", pval_adj_var ="p", ymin_var ="LB", ymax_var ="UB" ) %>%# Add overarching CI headerstab_spanner(label ="95% CI", columns =c(ymin_md, ymax_md), id ="ci_md") %>%tab_spanner(label ="95% CI", columns =c(ymin_smd, ymax_smd), id ="ci_smd") %>%tab_spanner(label ="95% CI", columns =c(ymin_var, ymax_var), id ="ci_var") %>%# Add spanning headers for each statistic with formulas - simple formattab_spanner(label =md("Mean Difference<br>X̄<sub>imputed</sub> - X̄<sub>unimputed</sub>"), columns =c(y_md, pval_adj_md, ymin_md, ymax_md), id ="md_main") %>%tab_spanner(label =md("Standardized Mean Difference<br>(X̄<sub>imputed</sub> - X̄<sub>unimputed</sub>) /<br>σ<sub>unimputed</sub>"), columns =c(y_smd, pval_adj_smd, ymin_smd, ymax_smd), id ="smd_main") %>%tab_spanner(label =md("Variance % Change<br>(σ²<sub>imputed</sub> - σ²<sub>unimputed</sub>) /<br>σ²<sub>unimputed</sub> × 100"), columns =c(y_var, pval_adj_var, ymin_var, ymax_var), id ="var_main") %>%# Add footnote for p-valuestab_footnote(footnote ="P values are Bonferroni-Holm adjusted within each statistic type (Mean Difference, SMD, Variance Difference, etc.",locations =cells_column_labels(columns =contains("pval_adj")),placement ="right" ) %>%# Style table - uniform formatting for spannerstab_style(style =cell_text(size =px(10), v_align ="middle"),locations =cells_column_spanners(spanners =c("md_main", "smd_main", "var_main")) ) %>%tab_options(column_labels.padding =px(0),table.font.size =px(9) ) %>%# Standardize column widthscols_width( outcome ~px(80),c(y_md, y_smd, y_var) ~px( 45),c(pval_adj_md, pval_adj_smd, pval_adj_var) ~px(38),c(ymin_md, ymax_md, ymin_smd, ymax_smd, ymin_var, ymax_var) ~px(45) )# tab_style(# style = cell_borders(sides = "right", color = "gray", weight = px(1)),# locations = cells_body(columns = c(ymax_md, ymax_smd))# )```# Changes in correlations```{r}#| output: falsetest_correlation_matrix =matrix(nrow =length(rq5y),ncol =length(rq5y))for(i inseq_along(rq5y)){for(j inseq_along(rq5y)){ test_correlation_matrix[i,j] =paste(rq5y[i], rq5y[j], sep ="-") }}vars = test_correlation_matrix[lower.tri(test_correlation_matrix, diag =FALSE)] x_var =str_extract(vars, "^[^-]+") y_var =str_extract(vars, "[^-]+$") missingcode =paste0("missing",1:length(vars))cor_df =do.call(rbind, lapply(seq_along(boot_compare_results), function(i) { result_df =t(as.data.frame(boot_compare_results[[i]]$cor_resid)) result_df =data.frame(.imp = i, .boot =1:nrow(result_df), result_df, row.names =NULL)return(result_df)}))bootstrap_summary_cor =apply(select(cor_df,-c(.imp,.boot)),2, function(xx) .mean_qi_pd(xx)) %>%bind_rows() %>%ungroup() %>%mutate(vars = vars,x_var = x_var,y_var = y_var,pval_adj = stats::p.adjust(pval, method ="holm") )if (length(vars) !=nrow(bootstrap_summary_cor)) stop("error")bootstrap_summary_cor %>%arrange(desc(abs(y)))```## Plot of changes in correlations```{r}bootstrap_summary_cor %>%# filter(x_var!=y_var) %>% # These correlations should probs be removed from bootstrap code! plot_lower_triangular_matrix2(variables = rq5y,labels = rq5y_labels_short,method ="none" ) +labs(title ="Correlation Change",subtitle =expression(r[imputed] - r[original]),caption =NULL,tag ="B2" ) +theme(plot.title =element_text(hjust =0.5, size =16),plot.subtitle =element_text(hjust =0.5, size =13.5, margin =margin(b =0)),plot.tag =element_text(hjust =0, vjust =0, size =30, face ="bold"),plot.tag.position ="topleft",panel.border =element_rect(color ="black", fill =NA, linewidth =1),plot.background =element_blank() ) +scale_fill_gradient2(low ="#0571b0",mid ="white",high ="#ca0020",midpoint =0,limits =c(-.185, .185), # Set min and max herena.value ="white" ) save_plot("5_correlation_residuals", width =8, height =8, trim =2)```[📊 View Plot](/plots/pngs/5_correlation_residuals.png)## GT Results table```{r}cor_table = bootstrap_summary_cor %>%mutate(x_var_label = rq5y_labels_short[match(x_var, rq5y)],y_var_label = rq5y_labels_short[match(y_var, rq5y)],pair =paste(x_var_label, "×", y_var_label) ) %>%select(pair, y, ymin, ymax, pval, pval_adj) %>%gt() %>%tab_options(table.width =px(800) ) %>%# Rename columnscols_label(pair ="Variable Pair",y ="Est",ymin ="LB",ymax ="UB",pval =md("p<sub>unadj</sub>"),pval_adj =md("p<sub>adj</sub>") ) %>%# Create column spannertab_spanner(label ="Correlation Difference",columns =c(y, ymin, ymax, pval, pval_adj) ) %>%# Format numbersfmt(columns =c(y, ymin, ymax),fns =function(x) {gbtoolbox::apa_num(x, n_decimal_places =3)} ) %>%fmt(columns =c(pval, pval_adj),fns =function(x) {gbtoolbox::apa_num(x, n_decimal_places =3)} ) %>%# Styling - uniform font sizetab_style(style =cell_text(size =px(10)),locations =cells_column_spanners() ) %>%tab_style(style =cell_text(size =px(10)),locations =cells_body() ) %>%tab_style(style =cell_text(size =px(10)),locations =cells_column_labels() ) %>%tab_style(style =cell_text(size =px(10)),locations =cells_footnotes() ) %>%# Highlight significant results - positive effects (light green)tab_style(style =cell_fill(color ="#d5e8d4"),locations =cells_body(columns =c(y, ymin, ymax, pval, pval_adj),rows = pval_adj <0.05& y >0 ) ) %>%# Highlight significant results - negative effects (light red)tab_style(style =cell_fill(color ="#f8cecc"),locations =cells_body(columns =c(y, ymin, ymax, pval, pval_adj),rows = pval_adj <0.05& y <0 ) ) %>%# Add footnotestab_footnote(footnote =md("<em>Note.</em> Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (p<sub>Bonferroni-Holm</sub>) effects are highlighted in green (increases) or red (decreases)."),placement ="right" ) %>%tab_footnote(footnote ="P values are Bonferroni-Holm adjusted",locations =cells_column_labels(columns = pval_adj),placement ="right" ) %>%tab_footnote(footnote =md("Correlation difference = Cor<sub>imputed</sub> - Cor<sub>original</sub>"),locations =cells_column_spanners(spanners ="Correlation Difference"),placement ="right" ) %>%opt_footnote_marks(marks =c("*", "†", "‡"))cor_table```## plot of unimputed correlationsTHIS PLOT NEEDS TO BE UPDATED AS WE DIDN"TINCLUDE ALL PARTICIPANTS IN df IN THE ANALYSES```{r}df %>%filter(!(randomfamid %in% exclude_fams_onesib)) %>%# Exclude fams with one sub in the studyfilter(!(randomfamid %in% rq5_exclude_fams)) %>%# Exclude fams with 0 data on rq5y variables (key outcomes)filter(!(randomfamid %in% rq5_exclude_fams_2)) %>%select(all_of(rq5y)) %>%data.frame() %>%`colnames<-`(rq5y_labels_short) %>% gbtoolbox::plot_correlations(confidence_interval =FALSE,text_size_r =2.5,abs_colour =FALSE ) +labs(title ="Unimputed Pairwise correlations",subtitle ="Below Diagonal: Correlations. Diagonal: Univariate Sample Sizes. Above Diagonal: Pairwise Sample Sizes",caption =NULL,tag ="B1" ) +theme(plot.title =element_text(hjust =0.5, size =16, face ="plain"),plot.subtitle =element_text(hjust =0, size =8, margin =margin(b =0)),plot.caption =element_text(hjust =1, size =8),plot.tag =element_text(hjust =0, vjust =0, size =30, face ="bold"),plot.tag.position ="topleft",plot.background =element_blank() ) +scale_fill_gradient2(low ="#0571b0",mid ="white",high ="#ca0020",midpoint =0,limits =c(-.9, .9), # Set min and max herena.value ="white" ) save_plot("5_correlations_unimputed", width =8, height =8, trim =TRUE)```[📊 View Plot](/plots/pngs/5_correlations_unimputed.png)# Changes in ACE estimates## Count of significant ACE differences (unadjusted)```{r}bootstrap_summary_df_ace %>%filter(group =="Difference", par %in%c("a", "c", "e")) %>%summarise(total_tests = dplyr::n(),sig_unadj =sum(pval <0.05),pct_sig =round(sig_unadj / total_tests *100, 1) )```## Largest change```{r}bootstrap_summary_df_ace %>%filter(group =="Difference", par %in%c("a", "c", "e")) %>%pull(y) %>%max(abs(.))```## Table```{r}# bootstrap_summary_df_ace %>%# filter(group == "Difference") %>% # # group_by(par) %>%# mutate(# pval_adj = stats::p.adjust(pval, method = "holm"),# name = rq5y_labels_short[match(name, rq5y)]# ) %>%# ungroup() %>%# arrange(pval_adj) %>%# knitr::kable()bootstrap_summary_df_ace %>%filter(par %in%c("a","c","e")) %>%group_by(group) %>%mutate(pval_adj = stats::p.adjust(pval, method ="holm"), ) %>%ungroup() %>%select(-starts_with("."), -n, -pd) %>%mutate(name_clean = rq5y_labels_short[match(name, rq5y)]) %>%pivot_wider(id_cols =c(par, name_clean, sex),names_from = group,values_from =c(y, ymin, ymax, pval, pval_adj),names_sep ="_" ) %>%select(-ymin_Original, -ymax_Original, -ymin_Imputed, -ymax_Imputed, -pval_Original, -pval_Imputed, -pval_adj_Original, -pval_adj_Imputed) %>%select(name_clean, par, y_Original, y_Imputed, y_Difference, ymin_Difference, ymax_Difference, pval_Difference, pval_adj_Difference, sex) %>%arrange(sex, name_clean, par, y_Imputed) %>%mutate(par =toupper(par)) %>%gt(groupname_col ="sex") %>%fmt_number(decimals =3) %>%tab_row_group(label ="Male",rows = sex =="male" ) %>%tab_row_group(label ="Female",rows = sex =="female" ) %>%cols_hide(sex) %>%cols_label(name_clean ="Variable",par ="",y_Original ="Original",y_Imputed ="Imputed",y_Difference ="Diff",ymin_Difference ="Lower",ymax_Difference ="Upper",pval_Difference =md("p<sub>unadj</sub>"),pval_adj_Difference =md("p<sub>adj</sub>") ) %>%tab_spanner(label ="95% CI",columns =c(ymin_Difference, ymax_Difference) ) %>%tab_spanner(label ="Estimates",columns =c(y_Imputed, y_Original) ) %>%tab_style(style =cell_fill(color ="lightgreen"),locations =cells_body(columns =everything(),rows = pval_adj_Difference <0.05 ) ) %>%tab_footnote(footnote ="P values are Bonferroni-Holm adjusted within each group (Original, Imputed, Difference)",locations =cells_column_labels(columns = pval_adj_Difference) )```## Plot```{r}bootstrap_summary_df_ace %>%filter(par %in%c("a","c","e")) %>%filter(group !="Difference") %>%mutate(name =factor(rq5y_labels_short[match(name, rq5y)], levels = rq5y_labels_short),group =ifelse(group =="Imputed", "I", "O"),group =factor(group, levels =c("I", "O")),sex =str_to_title(sex),par =toupper(par) ) %>%ggplot(aes(x = group, y = y, fill = par)) +geom_col(position ="stack", alpha =1) +facet_grid(sex ~ name, switch ="x") +theme_minimal() +theme(axis.text.x =element_text(angle =0, hjust = .5, size =10),strip.text =element_text(size =10),strip.text.x =element_text(angle =90, hjust =1, vjust = .5, size =10),strip.text.y =element_text(angle =0, size =11),strip.placement ="outside",panel.grid =element_blank(),plot.tag =element_text(hjust =0, vjust =0, size =24, face ="bold"),plot.tag.position ="topleft" ) +labs(title ="Imputed (I) or Original (Non-Imputed; O) ACE Estimates",y ="Proportion",x =NULL,fill ="Component",tag ="B" ) +scale_fill_manual(values =c("A"="#d73027", "C"="#fee08b", "E"="#4575b4")) +coord_cartesian(ylim =c(0, 1))save_plot("5_ace_comparison", width =8, height =5, trim =TRUE)```# Supplementary Tables & Plots## Comparison of sample size pre- and post-imputation```{r}p1 = df %>%select(all_of(rq5y)) %>%`colnames<-`(c(rq5y_labels_short)) %>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame() %>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations(confidence_interval =FALSE,sample_size =TRUE,text_size_r =1.5,text_size_axis =8.5 ) +labs(title ="Original (non-imputed data)")p2 = df_rq5_imputed %>%filter(.imp ==1) %>%select(all_of(rq5y)) %>%`colnames<-`(c(rq5y_labels_short)) %>%# select(rq5y) %>%# `colnames<-`(rq5y_labels_short) %>%as.data.frame() %>%# Note that this is needed for function to work - could improve? gbtoolbox::plot_correlations(confidence_interval =FALSE,sample_size =TRUE,text_size_r =1.5,text_size_axis =8.5 ) +labs(title ="Imputed data")p1 + p2save_plot("5_3_correlation_comparison", width =5.3*2, height =5.3)df_rq5_imputed %>%filter(.imp ==1) %>%sapply(., function(x) length(which(!is.na(x))))# df_rq5_imputed$ncg1```## Check for convergence of algorithm ```{r}imputed_mice %>%length()plot(imputed_mice[[4]], layout =c(4,6))# plot(imputed_mice[[3]], y = rq5y, layout = c(4,6))rq5y_labels_short[!(rq5y %in%colnames(imputed_mice[[1]]$data))]```## Table of all variables for imputation```{r}# Function to map variable prefix to study waveget_study_wave =function(var_name) { prefix =substr(var_name, 1, 1) wave_map =c("a"="1st Contact","b"="2 Year", "c"="3 Year","d"="4 Year","e"="In Home","g"="7 Year","h"="8 Year", "i"="9 Year","j"="10 Year","l"="12 Year","n"="14 Year","p"="16 Year","r"="18 Year","u"="21 Year","z"="26 Year" )return(wave_map[prefix])}impute_vars =readRDS(file.path("results","5_1_imputation_variables.Rds"))impute_vars_labels =var_to_label(impute_vars) %>%sapply(., function(x) ifelse(is.null(x[1]), "", x[1]))# Create formatted table for imputation variablesv_impute =data.frame(Description = impute_vars_labels,`Teds Code`=ifelse(impute_vars %in% original_colnames, impute_vars, paste0(impute_vars,"*")),`Range or Level`=sapply(impute_vars, function(var) {if (var %in%colnames(df)) {if (class(df[[var]]) =="numeric") {paste0(round(min(df[[var]], na.rm =TRUE), 2), " — ", round(max(df[[var]], na.rm =TRUE), 2)) } elseif (is.factor(df[[var]])) { factor_levels =levels(df[[var]])paste(c(paste0(factor_levels[1],"*"), factor_levels[-1]), collapse =", ") } else {paste(unique(df[[var]]), collapse =", ") } } else {"Variable not found" } }),N =sapply(impute_vars, function(var) {if (var %in%colnames(df)) {sum(!is.na(df[[var]])) } else {0 } }),`Study.Wave`=sapply(impute_vars, get_study_wave))# Clean up descriptions# v_impute$Description = str_remove(v_impute$Description, "\\(1st.*")# v_impute$Description = str_remove(v_impute$Description, "\\(2.*")# Manual edit for specific variablesv_impute$`Study.Wave`[v_impute$`Teds.Code`=="cens01pop98density"] ="1st Contact"v_impute$`Study.Wave`[v_impute$`Teds.Code`=="pollution1998pca"] ="1st Contact"# Add row group information# v_impute_indexed = cbind(row_group = "Imputation Variables", row_id = 1:nrow(v_impute), v_impute)gt(v_impute) %>%# tab_row_group(# label = "Imputation Variables",# rows = row_group == "Imputation Variables"# ) %>%# cols_hide(c(row_group, row_id)) %>%cols_label_with(fn =~gsub("\\.", " ", .x)) %>%tab_style(style =cell_text(size =px(12)),locations =cells_body() ) %>%tab_style(style =cell_text(size =px(12)),locations =cells_column_labels() ) %>%tab_style(style =cell_text(size =px(12)),locations =cells_row_groups() ) %>%cols_hide(N) %>%tab_source_note(source_note ="Note: Externalising SDQ scores are created post-imputation by adding conduct and hyperactivity problem scales. Range or Level shows min—max values for numeric variables or factor levels for categorical variables (reference level marked with *). Variable codes with an asterisk (*) have been derived or modified from the original dataset." ) %>%tab_options(table.width ="70%" ) %>%cols_width( Description ~px(300),everything() ~px(120) )```